In [1]:
# !pip install plotly-express
import pandas as pd
import numpy as np
from datetime import date
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import seaborn as sns
from matplotlib.colors import ListedColormap
from matplotlib import pyplot as plt
import plotly.express as px

%matplotlib inline

Data Handling¶

In [2]:
df=pd.read_csv('UDEMY_DATA.csv')
df
Out[2]:
best_seller course_rating_avarage course_price num_of_buyers_students amount_of_instructor_studnets num_articles video_time_length number_of_languages last_update instructor_rank amount_of_what_you_will_learn_count amount_of_requirments_count amount_of_companies_support course_rating_amount
0 1 4.7 69.90 493,315 1,415,946 230 60 13 12.2021 4.7 9 8 5 108,821
1 0 4.6 69.90 1,533,407 2,853,435 14 22 9 3.2021 4.6 12 6 5 418,238
2 0 4.7 89.90 150,857 201,184 Reviews 56 30.5 8 5.2022 4.7 18 7 5 36,525
3 0 4.6 89.90 375,779 53 Courses 12 70 8 3.2022 4.5 7 7 5 90,083
4 1 4.6 59.90 542,316 2,853,435 13 25 9 5.2020 4.6 16 6 5 115,128
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
26065 0 4.5 59.90 11 112 -1 1 hour on-demand video 2 7.2021 4.0 6 4 0 1 rating)
26066 0 3.7 59.90 68 6,057 1 19.5 2 5.2022 3.4 23 6 0 14
26067 0 3.9 699.90 15 40 -1 17.5 2 6.2021 3.6 8 5 0 2
26068 0 4.4 59.90 3,063 6,956 -1 3 2 4.2018 4.1 35 6 0 164
26069 0 4.6 59.90 596 596 1 10 2 1.2021 4.6 10 6 0 128

26070 rows × 14 columns

Markdown:

  • At this section, we removed the rows which includes empty fields. While we were crawling the data, whole data that failed for any reason, we filled its value as -1, so it helped us to easilly clear those empty values.
  • After cleaning, we used the reset_index command in order to get all the id's in the order and to avoid of list errors in the future.
In [3]:
columns = ["best_seller", "course_rating_avarage", "course_price", "num_of_buyers_students", "amount_of_instructor_studnets", "num_articles", "video_time_length", "number_of_languages", "last_update", "instructor_rank", "amount_of_what_you_will_learn_count", "amount_of_requirments_count", "amount_of_companies_support", "course_rating_amount"]
for column  in columns:
    df.drop(df.index[df[column] == -1], inplace=True)
df = df.reset_index(drop=True)
df
Out[3]:
best_seller course_rating_avarage course_price num_of_buyers_students amount_of_instructor_studnets num_articles video_time_length number_of_languages last_update instructor_rank amount_of_what_you_will_learn_count amount_of_requirments_count amount_of_companies_support course_rating_amount
0 1 4.7 69.90 493,315 1,415,946 230 60 13 12.2021 4.7 9 8 5 108,821
1 0 4.6 69.90 1,533,407 2,853,435 14 22 9 3.2021 4.6 12 6 5 418,238
2 0 4.7 89.90 150,857 201,184 Reviews 56 30.5 8 5.2022 4.7 18 7 5 36,525
3 0 4.6 89.90 375,779 53 Courses 12 70 8 3.2022 4.5 7 7 5 90,083
4 1 4.6 59.90 542,316 2,853,435 13 25 9 5.2020 4.6 16 6 5 115,128
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10881 0 4.3 59.90 3,754 40,786 1 5 2 1.2022 4.4 7 6 5 609
10882 0 4.6 59.90 16,937 68,564 2 24 3 4.2019 4.6 4 7 5 2,138
10883 0 4.5 59.90 1,963 848,521 14 1.5 2 5.2020 4.4 6 5 0 2
10884 0 3.7 59.90 68 6,057 1 19.5 2 5.2022 3.4 23 6 0 14
10885 0 4.6 59.90 596 596 1 10 2 1.2021 4.6 10 6 0 128

10886 rows × 14 columns

Markdown:

  • In the next lines, we had to take care of the formating, the data type and other creterias.
  • In order to do so, we cleared the words that were most common such as 'reviews', 'courses' and other, so we will have a string that includes only numbers.
  • After we got the string that includes numbers, we could simply conver it to float or int.
  • In order to convert 'too big' string, we had to convert it to Int64.
  • As you can see in the table below, there are only numbers which are integers or float and now we will be able to analyse the data using math and visualisation functions.
In [4]:
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace(" Reviews", "")
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace(" Courses", "")
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace(" Course", "")
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace(" Review", "")
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace("1Student", "")
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace("1 Student", "")
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace(",", "")
df['amount_of_instructor_studnets'] = df['amount_of_instructor_studnets'].str.replace("--", "")


df["amount_of_instructor_studnets"] = pd.to_numeric(df["amount_of_instructor_studnets"], downcast='integer')
df["amount_of_instructor_studnets"] = df["amount_of_instructor_studnets"].astype('Int64')


df['amount_of_instructor_studnets']
Out[4]:
0        1415946
1        2853435
2         201184
3             53
4        2853435
          ...   
10881      40786
10882      68564
10883     848521
10884       6057
10885        596
Name: amount_of_instructor_studnets, Length: 10886, dtype: Int64
In [5]:
df["last_update"].drop(df.index[(((df["last_update"].str.len() - df["last_update"].str.index("."))-1) != 4)], inplace=True, axis=0)
df['last_update'] = df['last_update'].str.replace("Published ", "")

total_diffrence = []
for row in df["last_update"]:
    month_diffrence = (int(date.today().month) - int(row.split(".",1)[0])) * 30
    year_diffrence = (int(date.today().year) - int(row.split(".",1)[1])) * 365
    total_diffrence.append(month_diffrence + year_diffrence)

df["last_update"] = total_diffrence
df["last_update"]
Out[5]:
0         185
1         455
2          30
3          90
4         760
         ... 
10881     150
10882    1155
10883     760
10884      30
10885     515
Name: last_update, Length: 10886, dtype: int64
In [6]:
df['course_rating_amount'] = df['course_rating_amount'].str.replace(",", "")
df['course_rating_amount'] = df['course_rating_amount'].str.replace("1 rating", "0")
df['course_rating_amount'] = df['course_rating_amount'].str.replace(")", "")

df["course_rating_amount"] = pd.to_numeric(df["course_rating_amount"], downcast='integer')
df["course_rating_amount"] = df["course_rating_amount"].astype('Int64')

df['course_rating_amount']
D:\Anaconda\Anaconda3\envs\mlcourse\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  This is separate from the ipykernel package so we can avoid doing imports until
Out[6]:
0        108821
1        418238
2         36525
3         90083
4        115128
          ...  
10881       609
10882      2138
10883         2
10884        14
10885       128
Name: course_rating_amount, Length: 10886, dtype: Int64
In [7]:
df['num_of_buyers_students'] = df['num_of_buyers_students'].str.replace(",", "")
df['num_of_buyers_students'] = df['num_of_buyers_students'].str.replace("1 student", "1")

df["num_of_buyers_students"] = pd.to_numeric(df["num_of_buyers_students"], downcast='integer')
df["num_of_buyers_students"] = df["num_of_buyers_students"].astype('Int64')

df['num_of_buyers_students']
Out[7]:
0         493315
1        1533407
2         150857
3         375779
4         542316
          ...   
10881       3754
10882      16937
10883       1963
10884         68
10885        596
Name: num_of_buyers_students, Length: 10886, dtype: Int64
In [8]:
df["course_price"] = pd.to_numeric(df["course_price"], downcast='float')
df["course_price"] = df["course_price"].astype('float64')

df['course_price']
Out[8]:
0        69.900002
1        69.900002
2        89.900002
3        89.900002
4        59.900002
           ...    
10881    59.900002
10882    59.900002
10883    59.900002
10884    59.900002
10885    59.900002
Name: course_price, Length: 10886, dtype: float64
In [9]:
df['video_time_length'] = df['video_time_length'].str.replace(" mins on-demand video", "")
df['video_time_length'] = df['video_time_length'].str.replace(" hour on-demand video", "")


df["video_time_length"] = pd.to_numeric(df["video_time_length"], downcast='float')
df["video_time_length"] = df["video_time_length"].astype('float64')

df['video_time_length']
Out[9]:
0        60.0
1        22.0
2        30.5
3        70.0
4        25.0
         ... 
10881     5.0
10882    24.0
10883     1.5
10884    19.5
10885    10.0
Name: video_time_length, Length: 10886, dtype: float64
In [10]:
 df['instructor_rank'] = df['instructor_rank'].str.replace("--", "0")
    
df["instructor_rank"] = pd.to_numeric(df["instructor_rank"], downcast='float')
df["instructor_rank"] = df["instructor_rank"].astype('float64')

df['instructor_rank']
Out[10]:
0        4.7
1        4.6
2        4.7
3        4.5
4        4.6
        ... 
10881    4.4
10882    4.6
10883    4.4
10884    3.4
10885    4.6
Name: instructor_rank, Length: 10886, dtype: float64
In [11]:
columns = ["best_seller", "course_rating_avarage", "course_price", "num_of_buyers_students", "amount_of_instructor_studnets", "num_articles", "video_time_length", "number_of_languages", "last_update", "instructor_rank", "amount_of_what_you_will_learn_count", "amount_of_requirments_count", "amount_of_companies_support", "course_rating_amount"]
for column  in columns:
    df[column] = df[column].round(2)
df
Out[11]:
best_seller course_rating_avarage course_price num_of_buyers_students amount_of_instructor_studnets num_articles video_time_length number_of_languages last_update instructor_rank amount_of_what_you_will_learn_count amount_of_requirments_count amount_of_companies_support course_rating_amount
0 1 4.7 69.9 493315 1415946 230 60.0 13 185 4.7 9 8 5 108821
1 0 4.6 69.9 1533407 2853435 14 22.0 9 455 4.6 12 6 5 418238
2 0 4.7 89.9 150857 201184 56 30.5 8 30 4.7 18 7 5 36525
3 0 4.6 89.9 375779 53 12 70.0 8 90 4.5 7 7 5 90083
4 1 4.6 59.9 542316 2853435 13 25.0 9 760 4.6 16 6 5 115128
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10881 0 4.3 59.9 3754 40786 1 5.0 2 150 4.4 7 6 5 609
10882 0 4.6 59.9 16937 68564 2 24.0 3 1155 4.6 4 7 5 2138
10883 0 4.5 59.9 1963 848521 14 1.5 2 760 4.4 6 5 0 2
10884 0 3.7 59.9 68 6057 1 19.5 2 30 3.4 23 6 0 14
10885 0 4.6 59.9 596 596 1 10.0 2 515 4.6 10 6 0 128

10886 rows × 14 columns

In [12]:
df.describe(include='all')
Out[12]:
best_seller course_rating_avarage course_price num_of_buyers_students amount_of_instructor_studnets num_articles video_time_length number_of_languages last_update instructor_rank amount_of_what_you_will_learn_count amount_of_requirments_count amount_of_companies_support course_rating_amount
count 10886.000000 10886.000000 10886.000000 1.088600e+04 1.086400e+04 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000
mean 0.078082 4.223278 64.202407 1.271581e+04 1.815136e+05 8.607110 12.058975 2.302958 517.665350 4.316213 8.482914 5.871027 1.335660 1573.600772
std 0.268313 0.807010 45.094440 4.390705e+04 3.641146e+05 18.917087 15.344214 1.321281 592.760663 0.469917 7.611141 0.605663 2.212411 10046.506905
min 0.000000 0.000000 53.900000 0.000000e+00 1.000000e+00 1.000000 1.000000 2.000000 0.000000 0.000000 1.000000 5.000000 0.000000 0.000000
25% 0.000000 4.100000 59.900000 2.660000e+02 3.822750e+03 1.000000 3.500000 2.000000 90.000000 4.200000 4.000000 6.000000 0.000000 22.000000
50% 0.000000 4.400000 59.900000 1.743000e+03 2.570500e+04 3.000000 7.000000 2.000000 275.000000 4.400000 6.000000 6.000000 0.000000 81.000000
75% 0.000000 4.600000 59.900000 9.132750e+03 1.566728e+05 8.000000 14.500000 2.000000 760.000000 4.600000 10.000000 6.000000 5.000000 412.000000
max 1.000000 5.000000 699.900000 1.533407e+06 2.885344e+06 333.000000 194.500000 16.000000 3710.000000 5.000000 175.000000 9.000000 5.000000 418238.000000

Markdown:

  • Now, we have to clear the outliner. If we wouldnt, it will have huge impact on some parameters such as max, average and others. In addition, in future, when we would like to change some of the numbers to bins, we will get a problem because most of the information would be in 1 small bin while there will be a lot of empty bins until we get the high outliner. Same goes to the lower. We decided to use the IQR creteria tactic.
In [13]:
# handle outliers
for col in df.columns:
    if (df.dtypes[col] == int) or (df.dtypes[col] == float):
        q1 = np.percentile(df[col], 25)
        q3 = np.percentile(df[col], 75)
        iqr_value = q3 - q1
        df[col] = np.where(df[col]<q1 -1.5*iqr_value, np.nan, df[col])
        df[col] = np.where(df[col]>q3 +1.5*iqr_value, np.nan, df[col])

Markdown:

  • In addition, we got some duplicated lines. In order to hundle it, we will use the drop_duplicates that clears the duplicated lines.
In [14]:
df.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=False)
In [15]:
df = df.dropna(how='any')
In [16]:
df.info()
df.reset_index(drop=True, inplace=True)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5889 entries, 4 to 10885
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   best_seller                          5889 non-null   int64  
 1   course_rating_avarage                5889 non-null   float64
 2   course_price                         5889 non-null   float64
 3   num_of_buyers_students               5889 non-null   Int64  
 4   amount_of_instructor_studnets        5889 non-null   Int64  
 5   num_articles                         5889 non-null   int64  
 6   video_time_length                    5889 non-null   float64
 7   number_of_languages                  5889 non-null   int64  
 8   last_update                          5889 non-null   int64  
 9   instructor_rank                      5889 non-null   float64
 10  amount_of_what_you_will_learn_count  5889 non-null   int64  
 11  amount_of_requirments_count          5889 non-null   int64  
 12  amount_of_companies_support          5889 non-null   int64  
 13  course_rating_amount                 5889 non-null   Int64  
dtypes: Int64(3), float64(4), int64(7)
memory usage: 707.4 KB
In [17]:
columns = ["best_seller", "course_rating_avarage", "course_price", "num_of_buyers_students", "amount_of_instructor_studnets", "num_articles", "video_time_length", "number_of_languages", "last_update", "instructor_rank", "amount_of_what_you_will_learn_count", "amount_of_requirments_count", "amount_of_companies_support", "course_rating_amount"]
file_name = 'UDEMY_DATA_after_cleaning.csv'
df.to_csv(file_name, header=columns, index=False)

EDA & Visualiztion¶

In [18]:
pok = pd.read_csv('UDEMY_DATA_after_cleaning.csv', header=0, sep=',') 
In [19]:
pok.head()
Out[19]:
best_seller course_rating_avarage course_price num_of_buyers_students amount_of_instructor_studnets num_articles video_time_length number_of_languages last_update instructor_rank amount_of_what_you_will_learn_count amount_of_requirments_count amount_of_companies_support course_rating_amount
0 1 4.6 59.9 542316 2853435 13 25.0 9 760 4.6 16 6 5 115128
1 0 4.6 59.9 136543 59799 4 9.0 8 425 4.7 8 7 5 40180
2 0 4.5 59.9 224747 547988 1 13.5 2 30 4.5 18 7 5 20177
3 0 4.5 59.9 93333 704228 1 2.5 7 30 4.5 1 6 5 26088
4 0 4.6 59.9 10352 395969 39 20.0 2 60 4.5 7 6 5 478
In [20]:
# Counts of Best Sellers
fig = px.pie(df, names = "best_seller",
             title = "<b>Counts in best_seller</b>",             
             color_discrete_sequence=px.colors.sequential.Blackbody_r,             
             hole = 0.5)

fig.update_layout(title_x = 0.5,
                  title_font = dict(size = 20))

fig.update_traces(textposition='inside',
                  textinfo='percent+label',    
                  textfont_size=15,                  
                  marker=dict(line=dict(color='#000000', width = 1.5)))


fig.show()

markdown:

  • At this point, we decided to show hitmap(like HIT map :) ).
  • The colors show how strong is the connection between 2 parameters(cols), while as much as it closer to 1, as much as the connection stronger. It is a good way to sum up the corr numbers.
In [21]:
# Heatmap
plt.figure(figsize=(10,7))
sns.heatmap(df.corr())
Out[21]:
<AxesSubplot:>
In [22]:
df.groupby('best_seller').agg({"num_of_buyers_students":'max',"amount_of_instructor_studnets":'max'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Maximum Number of Buyers & Amount of Instructors\n", size=15)
plt.grid()
plt.legend(loc='upper left')
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.show()

Markdown:

  • Its clear that the best seller has maximum number of students and little low number of instructors
In [23]:
df.groupby('best_seller').agg({'video_time_length':'max',
                               'number_of_languages':'max'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Counts Video time length & Number of languages\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.show()

Markdown:

  • The best seller has lower counts of video length and number of languages as compare to non best sellers at max
In [24]:
df.groupby('best_seller').agg({'course_rating_amount':'mean',
                               'num_of_buyers_students':'mean'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Average Course rating amount & Number of Buyer students\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Mean", size=12)
plt.show()
In [25]:
df.groupby('best_seller').agg({'video_time_length':'mean',
                               'number_of_languages':'mean'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Average Video time length & Number of languages\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.show()

Markdown:

  • Its clear that the best seller has higher time of video length and higher number of languages as compare to non best sellers at mean
In [26]:
df.groupby('best_seller').agg({'num_of_buyers_students':'max'}).plot(kind = 'bar', rot=1, figsize=(12,5))
plt.title("Visualizing Number of buyer students\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.show()

Markdown:

  • We can see that if you have 500000 buyers, you would probably be best seller.
In [27]:
df.groupby('best_seller').agg({'number_of_languages':'max'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Maximum Number of languages\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.show()
In [28]:
df.groupby('best_seller').agg({'course_rating_avarage':'mean',
                               'instructor_rank':'min'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Average Course rating & Minimum Instructor Rank\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Mean Count", size=12)
plt.show()

Markdown:

  • In average,the minium of instructers rank should be 4 in order to become a best seller!
In [29]:
df.groupby('best_seller').agg({'num_articles':'mean',
                               'video_time_length':'mean'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Average Number of articles & Video time length\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Mean Count", size=12)
plt.show()

Markdown:

  • How surprising! In average, in order to upper your chances to become a best seller, you will need about 12 articles and videoe time length, and way higher than 7.
In [30]:
df.groupby('best_seller').agg({'course_rating_avarage':'mean',
                               'amount_of_companies_support':'mean'}).plot(kind = 'bar', rot=1, figsize=(12,5))

plt.title("Visualizing Average Course Rating & Amount of Companies supports\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
plt.ylabel("Count", size=12)
plt.show()
In [31]:
correlations = []
for i in range(1,df.corr().values.shape[0]):
    for j in range(1,df.corr().values.shape[1]):
        if i < j and df.corr().values[i][j] >= 0.5:
            correlations.append(df.corr().values[i][j])
tuple_arr = []
for i in range(1,df.corr().values.shape[0]):
    for j in range(1,df.corr().values.shape[1]):
        if i < j and df.corr().values[i][j] >= 0.5:
            tuple_arr.append((i,j))
print(correlations)
print(tuple_arr)

cols_for_correlations = ['best_seller', 'course_rating_avarage', 'course_price', 'num_of_buyers_students', 'amount_of_instructor_studnets', 'num_articles', 'video_time_length'
,'number_of_languages', 'last_update', 'instructor_rank', 'amount_of_what_you_will_learn_count', 'amount_of_requirments_count','amount_of_companies_support', 'course_rating_amount']
indx_sort = np.argsort(correlations)
for n_correlation in indx_sort:
    col_lt, col_rt = tuple_arr[n_correlation]
    col_name_lt, col_name_rt = cols_for_correlations[col_lt], cols_for_correlations[col_rt]
    title = "corr('%s', '%s')=%4.2f" %(col_name_lt, col_name_rt, correlations[n_correlation]) 
    print(title)
[0.5872909486255095, 0.6704233870682552, 0.5764588553470059]
[(1, 9), (3, 13), (7, 13)]
corr('number_of_languages', 'course_rating_amount')=0.58
corr('course_rating_avarage', 'instructor_rank')=0.59
corr('num_of_buyers_students', 'course_rating_amount')=0.67

Markdown:

  • After looking at all the visualisations, we would analyse the information in order to create the strongest modules for the machine learning. In the end of the machine learning, we would add a conlusion based on the visualisation and the learning results.

Machine Learning¶

splitting to bins¶

  • In order to improve our machine learning models, we decided to devide some of the parameters(cols) to bins.
  • Intead of filling the bin and labels credintals manually, we created a function that gets the num of bins and deviding it equally.
In [32]:
#verly low - very high

RANGE = 5
labels=[]
for i in range(RANGE):
    labels.append(i + 1)

bins=[]
divider_value = df['num_of_buyers_students'].max() / RANGE
bin_val=0
for value in range(RANGE):
    bins.append(math.floor(bin_val))
    bin_val = bin_val + divider_value
bins.append(df['num_of_buyers_students'].max())

df['num_of_buyers_students'] = pd.cut(df['num_of_buyers_students'], bins=bins, labels=labels)
D:\Anaconda\Anaconda3\envs\mlcourse\lib\site-packages\ipykernel_launcher.py:16: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [33]:
#verly low , low, average, high, very high
#1         ,2   ,3       ,4    ,5

RANGE = 5
labels=[]
for i in range(RANGE):
    labels.append(i + 1)

bins=[]
divider_value = df['amount_of_instructor_studnets'].max() / RANGE
bin_val=0
for value in range(RANGE):
    bins.append(math.floor(bin_val))
    bin_val = bin_val + divider_value
bins.append(df['amount_of_instructor_studnets'].max())

df['amount_of_instructor_studnets'] = pd.cut(df['amount_of_instructor_studnets'], bins=bins, labels=labels)
D:\Anaconda\Anaconda3\envs\mlcourse\lib\site-packages\ipykernel_launcher.py:17: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Spliting data¶

  • Since we try to predict the categorical column called 'best_seller' (1-best seller, 0- not best seller) we splitted the data and extracted the best_seller column to 'y' variable and the rest of the columns to X variable
  • After splitting the data to X and y, we splitted it again to XTrain, XTest, yTrain, yTest using train_test_split function of sklearn. we used the random_state=42(standard number) for randomize the instances and test_size=0.2 in order to get 80% of the data to train and 20% to the test(standard).
In [34]:
df1 = df.copy()
# df1 = df1.drop(['course_rating_avarage', 'last_update', 'course_rating_amount'], axis=1)
# X = df1.loc[:, df1.columns != 'best_seller']
X = df1.loc[:, df.columns != 'best_seller']
y = df1['best_seller']
In [35]:
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=42, test_size=0.2)

Models¶

  • After preparing the dataset as best as possible it's now the time to try diffrent models and which one predicts the best results.
  • We start with the KNN, then Decision tree and Naive bayes
  • for each model we show the confusion matrix, Accuracy, Precision, Recall, f1_score

KNN¶

In [36]:
clf = KNeighborsClassifier(n_neighbors=3)
In [37]:
clf.fit(XTrain, yTrain)
y_pred=clf.predict(XTest)
In [38]:
print('confusion matrix:\n ',metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))
scores = cross_val_score(clf, X, y, cv=10)
print("Accuracy: %0.2f" % scores.mean())
print("Recall: %0.2f" % metrics.recall_score(y_true = yTest, y_pred = y_pred))
print("Precision: %0.2f" % metrics.precision_score(y_true = yTest, y_pred = y_pred))
print("F1: %0.2f" % metrics.f1_score(y_true = yTest, y_pred = y_pred))
confusion matrix:
  [[1068   28]
 [  54   28]]
Accuracy: 0.93
Recall: 0.34
Precision: 0.50
F1: 0.41
  • Conclusions:
    • The accuracy is very high but when we look in the Recall we understand that the amount of the True Positive(TP) from all the correct answers is very low (under 0.5) which means the model is dealing very well with the prediction of the best seller
    • The precision is average and not really bad
    • Because the F1 score is affected both by the recall and Precision

Decision Tree¶

In [39]:
decisionTree = tree.DecisionTreeClassifier()
In [40]:
decisionTree = decisionTree.fit(XTrain, yTrain)
y_pred = decisionTree.predict(XTest)
In [41]:
print('confusion matrix:\n ',metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))
print('Accuracy: ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))
print("Recall: %0.2f" % metrics.recall_score(y_true = yTest, y_pred = y_pred))
print("Precision: %0.2f" % metrics.precision_score(y_true = yTest, y_pred = y_pred))
print("F1: %0.2f" % metrics.f1_score(y_true = yTest, y_pred = y_pred))
confusion matrix:
  [[1052   44]
 [  38   44]]
Accuracy:  0.9303904923599321
Recall: 0.54
Precision: 0.50
F1: 0.52
  • Conclusions:
    • The model is very good, all the measures are above 0.5 which means that the predictions of the model are likely to be right.

Naive Bayes¶

In [42]:
gnb = GaussianNB()
In [43]:
gnb.fit(XTrain,yTrain)
y_pred = gnb.predict(XTest)
In [44]:
print('confusion matrix:\n ',metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))
print('Accuracy: ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))
print("Recall: %0.2f" % metrics.recall_score(y_true = yTest, y_pred = y_pred))
print("Precision: %0.2f" % metrics.precision_score(y_true = yTest, y_pred = y_pred))
print("F1: %0.2f" % metrics.f1_score(y_true = yTest, y_pred = y_pred))
confusion matrix:
  [[1035   61]
 [  64   18]]
Accuracy:  0.8938879456706282
Recall: 0.22
Precision: 0.23
F1: 0.22
  • Conclusions:
    • The accuracy is high but when we look in the Recall and the Precision we see they are really bad compare to the other models which set this model as the worst one to use.

Summary


  • The best model to predict the best seller is Decision Tree.
  • Only 6.72% sellers are best sellers
  • Best seller has maximum number of students and low number of instructors
  • Best seller has lower counts of video length and number of languages as compare to non best sellers
  • Course rating amount has number of buyer students are higher for Best sellers
  • Best sellers has lower number of language max, but higher in average !!!
  • Minimum instructor rank in avarage 4 for best sellers, but we can see that if your rating is 3.6 or lower, there will no be any chance to get the best seller.
  • Best sellers has higher numbers of articles and higher average video length as compare to non best sellers
  • Best sellers has higher number of companies that supports compare to non best sellers

  • So, if you would like to succeed with your video and to get higher exposure, we would reccomend you to create a lot of videos, to teach the course at least in a few languages and to plan anything before publishing. Because as we can see, for best seller it is not that important to update the course sometimes, but you would like to rush as higher rating as you can from start.